We will use data from the NCAA basketball tournament from 2011 to 2016.
# Read the game-level tournament results, keep seasons from 2011 onward,
# and print the filtered tibble.
hoops <- read_csv(
  'http://www.math.montana.edu/ahoegh/teaching/stat408/datasets/TourneyDetailedResults.csv'
)
hoops_2011 <- filter(hoops, Season >= 2011)
print(hoops_2011)
## # A tibble: 402 x 34
## Season Daynum Wteam Wscore Lteam Lscore Wloc Numot Wfgm Wfga Wfgm3 Wfga3
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2011 134 1155 70 1412 52 N 0 26 50 4 13
## 2 2011 134 1421 81 1114 77 N 1 27 54 4 12
## 3 2011 135 1427 70 1106 61 N 0 23 54 4 16
## 4 2011 135 1433 59 1425 46 N 0 20 59 9 24
## 5 2011 136 1139 60 1330 58 N 0 22 54 7 26
## 6 2011 136 1140 74 1459 66 N 0 24 61 6 22
## 7 2011 136 1153 78 1281 63 N 0 29 54 4 11
## 8 2011 136 1163 81 1137 52 N 0 32 66 9 24
## 9 2011 136 1196 79 1364 51 N 0 29 53 8 23
## 10 2011 136 1211 86 1385 71 N 0 28 52 9 15
## # … with 392 more rows, and 22 more variables: Wftm <dbl>, Wfta <dbl>,
## # Wor <dbl>, Wdr <dbl>, Wast <dbl>, Wto <dbl>, Wstl <dbl>, Wblk <dbl>,
## # Wpf <dbl>, Lfgm <dbl>, Lfga <dbl>, Lfgm3 <dbl>, Lfga3 <dbl>, Lftm <dbl>,
## # Lfta <dbl>, Lor <dbl>, Ldr <dbl>, Last <dbl>, Lto <dbl>, Lstl <dbl>,
## # Lblk <dbl>, Lpf <dbl>
# One row per season: mean score and mean made three-pointers, separately for
# the winning (W*) and losing (L*) side of each game.
points <- summarise(
  group_by(hoops_2011, Season),
  Win.Points = mean(Wscore),
  Lose.Points = mean(Lscore),
  Win.3Pt = mean(Wfgm3),
  Lose.3pt = mean(Lfgm3)
)
head(points)
## # A tibble: 6 x 5
## Season Win.Points Lose.Points Win.3Pt Lose.3pt
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2011 73.2 61.9 7.18 5.96
## 2 2012 71.4 61.5 5.97 5.93
## 3 2013 72.2 59.3 6.70 5.31
## 4 2014 73.9 62.9 6.19 5.46
## 5 2015 72.9 62.6 6.34 6.22
## 6 2016 78.3 65.4 7.18 6.52
# Seven base-graphics figures that build up the same chart step by step.
# Each figure starts with its own plot() call and is independent of the others.
# `points` here is the per-season summary tibble; calls to points(...) still
# reach graphics::points() because R skips non-function objects when it
# resolves a function call.

# Figure 1: first attempt. ylim stops at the largest losing-team average, so
# the (higher) winning-team line added below falls outside the plot region.
plot(points$Lose.Points ~ points$Season,
     ylim = c(0, max(points$Lose.Points)),
     ylab = 'Average Points Scored', xlab = 'Season',
     type = 'n',  # set up the axes only; both series are drawn by lines()
     main = 'Average Points Scored by Losing Teams \n in NCAA Tournament',
     cex.lab = .8, cex.main = .8)
lines(points$Lose.Points ~ points$Season, col = 'red', lwd = 1)
lines(points$Win.Points ~ points$Season, col = 'blue', lwd = 1, lty = 2)

# Figure 2: ylim now reaches the largest winning-team average, the title is
# generalised, and a legend identifies the two lines.
plot(points$Lose.Points ~ points$Season,
     ylim = c(0, max(points$Win.Points)),
     ylab = 'Average Points Scored', xlab = 'Season',
     type = 'n',
     main = 'Average Points Scored in NCAA Tournament',
     cex.lab = .8, cex.main = .8)
lines(points$Lose.Points ~ points$Season, col = 'red', lwd = 1)
lines(points$Win.Points ~ points$Season, col = 'blue', lwd = 1, lty = 2)
legend('bottomright', legend = c('Winning Team', 'Losing Team'),
       col = c('blue', 'red'),
       lwd = c(1, 1), lty = c(2, 1), cex = .7)

# Figure 3: overlay every individual winning score as a semi-transparent dot;
# ylim is widened to the largest single winning score so all dots fit.
plot(points$Lose.Points ~ points$Season,
     ylim = c(0, max(hoops_2011$Wscore)),
     ylab = 'Average Points Scored', xlab = 'Season',
     type = 'l', lwd = 1, col = 'red',
     main = 'Average Points Scored in NCAA Tournament',
     cex.lab = .8, cex.main = .8)
lines(points$Win.Points ~ points$Season, col = 'blue', lwd = 1, lty = 2)
points(hoops_2011$Wscore ~ hoops_2011$Season, pch = 16,
       col = rgb(0, 0, .5, .1))
# NA entries switch off the line (or the symbol) for a legend row.
legend('bottomright',
       legend = c('Average Winning Team', 'Average Losing Team',
                  'Individual Winning Team'),
       col = c('blue', 'red', rgb(0, 0, .5)),
       lwd = c(1, 1, NA), lty = c(2, 1, NA), pch = c(NA, NA, 16), cex = .7)

# Figure 4: same as Figure 3 plus a text label and an arrow pointing at 2016.
plot(points$Lose.Points ~ points$Season,
     ylim = c(0, max(hoops_2011$Wscore)),
     ylab = 'Average Points Scored', xlab = 'Season',
     type = 'l', lwd = 1, col = 'red',
     main = 'Average Points Scored in NCAA Tournament',
     cex.lab = .8, cex.main = .8)
lines(points$Win.Points ~ points$Season, col = 'blue', lwd = 1, lty = 2)
points(hoops_2011$Wscore ~ hoops_2011$Season, pch = 16,
       col = rgb(0, 0, .5, .1))
text(2014.5, 40, 'Shot Clock Shortened', cex = .6)
arrows(x0 = 2015, y0 = 42, x1 = 2016, y1 = 70, length = 0.1, lwd = 2)
legend('bottomright',
       legend = c('Average Winning Team', 'Average Losing Team',
                  'Individual Winning Team'),
       col = c('blue', 'red', rgb(0, 0, .5)),
       lwd = c(1, 1, NA), lty = c(2, 1, NA), pch = c(NA, NA, 16), cex = .6)

# Figure 5: default axes suppressed, and both series drawn as points joined by
# lines (type = 'b') with distinct plotting symbols. No axes are added here.
plot(points$Lose.Points ~ points$Season,
     ylim = c(0, max(hoops_2011$Wscore)), axes = FALSE,
     ylab = 'Average Points Scored', xlab = 'Season',
     type = 'b', pch = 16, lwd = 1, col = 'red',
     main = 'Average Points Scored in NCAA Tournament',
     cex.main = .75, cex.lab = .75)
lines(points$Win.Points ~ points$Season, col = 'blue',
      lwd = 1, lty = 2, type = 'b', pch = 17)
legend('bottomright',
       legend = c('Average Winning Team', 'Average Losing Team'),
       col = c('blue', 'red'),
       lwd = 1, lty = c(2, 1), pch = c(17, 16), cex = .7)

# Figure 6: custom axes -- the y axis on the right-hand side (side 4) and
# season labels on the x axis. The winning-team line is drawn thicker.
plot(points$Lose.Points ~ points$Season,
     ylim = c(0, max(hoops_2011$Wscore)), axes = FALSE,
     ylab = 'Average Points Scored', xlab = 'Season',
     type = 'b', pch = 16, lwd = 1, col = 'red',
     main = 'Average Points Scored in NCAA Tournament',
     cex.main = .75, cex.lab = .75)
lines(points$Win.Points ~ points$Season,
      col = 'blue', lwd = 3, lty = 2, type = 'b', pch = 17)
# Legend widths follow the plotted lines: winning lwd = 3, losing lwd = 1.
legend('bottomleft',
       legend = c('Average Winning Team', 'Average Losing Team'),
       col = c('blue', 'red'),
       lwd = c(3, 1), lty = c(2, 1), pch = c(17, 16), cex = .7)
axis(4)
axis(1, at = 2011:2016,
     labels = c('10-11', '11-12', '12-13', '13-14', '14-15', '15-16'))

# Figure 7: Figure 6 with the surrounding box restored.
plot(points$Lose.Points ~ points$Season,
     ylim = c(0, max(hoops_2011$Wscore)), axes = FALSE,
     ylab = 'Average Points Scored', xlab = 'Season',
     type = 'b', pch = 16, lwd = 1, col = 'red',
     main = 'Average Points Scored in NCAA Tournament',
     cex.main = .75, cex.lab = .75)
lines(points$Win.Points ~ points$Season,
      col = 'blue', lwd = 3, lty = 2, type = 'b', pch = 17)
# Legend widths follow the plotted lines: winning lwd = 3, losing lwd = 1.
legend('bottomleft',
       legend = c('Average Winning Team', 'Average Losing Team'),
       col = c('blue', 'red'),
       lwd = c(3, 1), lty = c(2, 1), pch = c(17, 16), cex = .7)
axis(4)
axis(1, at = 2011:2016,
     labels = c('10-11', '11-12', '12-13', '13-14', '14-15', '15-16'))
box()
Use the Seattle Housing Data Set http://math.montana.edu/ahoegh/teaching/stat408/datasets/SeattleHousing.csv to create an interesting graphic, include informative titles, labels, and add an annotation.
## Parsed with column specification:
## cols(
## price = col_double(),
## bedrooms = col_double(),
## bathrooms = col_double(),
## sqft_living = col_double(),
## sqft_lot = col_double(),
## floors = col_double(),
## waterfront = col_double(),
## sqft_above = col_double(),
## sqft_basement = col_double(),
## zipcode = col_double(),
## lat = col_double(),
## long = col_double(),
## yr_sold = col_double(),
## mn_sold = col_double()
## )
# Histogram of Seattle sale prices on the density scale, with a hand-built
# x axis labelled in millions of dollars and an arrow + text annotation.
# NOTE(review): seattle_in is not created in the visible code -- presumably it
# is the SeattleHousing.csv file read with read_csv(); confirm.
hist(seattle_in$price, probability = TRUE, breaks = "FD", ylab = '',
     col = 'forestgreen',
     xlab = 'Sales Price (million $)',
     main = 'Houses Sold in Seattle', axes = FALSE)
# Default axes are off; only the x axis is drawn, with tick labels in millions.
axis(1, at = c(0, 500000, 1000000, 2500000, 4000000, 5500000,
               7000000),
     labels = c('0', '.5', '1', '2.5', '4', '5.5', '7'))
arrows(x0 = 2500000, y0 = 1.5e-6, x1 = 1500000, y1 = .5e-6,
       length = 0.1, lwd = 2)
text(2500000, 1.8e-6,
     'Most homes sell for \n less than one million', cex = .8)

# plotmath demo: type = 'n' sets up the coordinate system from the density of
# winning scores without drawing the curve, then a formula is placed in the
# empty frame and another is used as the y-axis label.
plot(density(hoops_2011$Wscore),
     ylab = expression(beta[2]), xlab = '',
     main = 'Examples with Expression', axes = FALSE, type = 'n')
box()
text(70, .023, expression(sum(theta[i]^2, i = 1, n)), cex = 2)
Why ggplot2?
The basic idea: independently specify plot building blocks and combine them to create just about any kind of graphical display you want.
Building blocks of a graph include:
Compared to base graphics, ggplot2
Aesthetics are things that you can see. Examples include:
Aesthetic mappings are set with the aes() function.
Geometric objects are the actual marks we put on a plot. Examples include:
- points (geom_point)
- lines (geom_line)
- boxplots (geom_boxplot)
A plot must have at least one geom; there is no upper limit. You can add a geom to a plot using the + operator.
# Slide headings that were fused onto the code line; kept as comments so they
# are not evaluated as bare calls:
# geom_point()
# geom_smooth()
# geom_rug()
# geom_density2d()
# geom_jitter()
# labs()
# NOTE(review): graph.a is not defined in the visible code before this point --
# presumably a ggplot of hoops_2011 with losing-team field goals on x and
# winning-team field goals on y (judging by the axis labels below); confirm.
graph.a + geom_rug() + geom_density2d() +
  geom_jitter() +
  labs(x = 'Losing Team Field Goals Made',
       y = 'Winning Team Field Goals Made')

# xlim() and ylim()
# Both axes run from 0 to the largest winning-team field-goal count, so the
# two scales are directly comparable.
graph.a + geom_rug() + geom_density2d() +
  geom_jitter() +
  labs(x = 'Losing Team Field Goals Made',
       y = 'Winning Team Field Goals Made') +
  xlim(c(0, max(hoops_2011$Wfgm))) + ylim(c(0, max(hoops_2011$Wfgm)))
There are a wide range of themes available in ggplot: theme overview
Now use ggplot2 to create an interesting graph using the Seattle Housing data set.
# Price vs. living space for Seattle home sales: jittered points coloured by
# zipcode, a loess trend over all points, and a boxed text annotation with an
# arrow pointing down into the point cloud.
seattle_in$zipcode <- as.factor(seattle_in$zipcode)  # discrete colour scale
graph.a <- ggplot(data = seattle_in, aes(sqft_living, price))
graph.a +
  geom_jitter(aes(col = zipcode)) +
  geom_smooth(method = 'loess') +
  theme(plot.title = element_text(size = 8),
        text = element_text(size = 6)) +
  ggtitle('Seattle Housing Sales: Price vs. Square Footage Living Space') +
  ylab('Sales Price (million dollars)') +
  xlab('Living Space (square foot)') +
  # Breaks are in dollars; labels show the same values in millions.
  scale_y_continuous(breaks = seq(0, 7000000, by = 1000000),
                     labels = as.character(0:7)) +
  # Layers are drawn in the order they are added: the grey box goes first so
  # the text is painted on top of it rather than underneath it.
  annotate("rect", xmin = 0, xmax = 7250, ymin = 5500000, ymax = 6500000,
           alpha = .6) +
  annotate('text', 3500, 6000000,
           label = 'Housing price depends on zipcode', size = 2) +
  # annotate() draws the arrow once; geom_segment() with constant aesthetics
  # would redraw an identical arrow for every row of seattle_in.
  annotate('segment', x = 3500, xend = 3500, y = 5500000, yend = 3000000,
           arrow = arrow(length = unit(0.5, "cm")))